library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaflet)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readxl)

# Default figure sizing for every knitted chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = .6, out.width = "90%")

# Minimal theme with the legend underneath the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Use viridis for continuous colour and fill scales ...
options(
  ggplot2.continuous.fill = "viridis",
  ggplot2.continuous.colour = "viridis"
)

# ... and override the discrete defaults with the viridis equivalents.
scale_fill_discrete <- scale_fill_viridis_d
scale_colour_discrete <- scale_color_viridis_d

Read in wine data.

#' Extract the first four-digit year (1900-2099) found in each string.
#'
#' @param string Character vector to search (e.g. wine titles).
#' @return Numeric vector the same length as `string`; `NA` where no
#'   stand-alone year in the 1900-2099 range is present.
year_extract <- function(string) {
  # `(19|20)` restricts the century explicitly. A character class such as
  # `[9|0]` would also accept a literal "|" and let "10xx"/"29xx" through,
  # so a title like "1000 Stories 2013 ..." would yield 1000 instead of 2013.
  # `\\b` stops a year from being picked out of a longer run of digits.
  hits <- regmatches(string, regexec("\\b(19|20)[0-9]{2}\\b", string))

  # Element 1 of each hit is the full match (element 2 is the century group).
  # vapply() guarantees a numeric vector, including numeric(0) for empty input.
  vapply(
    hits,
    function(m) if (length(m) > 0) as.numeric(m[[1]]) else NA_real_,
    numeric(1)
  )
}
# Load the raw reviews, drop the two columns that are not used in the
# analysis, and derive a `year` column from the wine title.
wine_df <-
  "./wine_data/winemag-data-130k-v2.csv" %>%
  read_csv() %>%
  select(-c(region_2, taster_twitter_handle)) %>%
  mutate(year = year_extract(title))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   country = col_character(),
##   description = col_character(),
##   designation = col_character(),
##   points = col_double(),
##   price = col_double(),
##   province = col_character(),
##   region_1 = col_character(),
##   region_2 = col_character(),
##   taster_name = col_character(),
##   taster_twitter_handle = col_character(),
##   title = col_character(),
##   variety = col_character(),
##   winery = col_character()
## )
### Remove region_2 and taster_twitter_handle, and add a year extracted from the title.

#wine_type <- read_csv("./wine_data/winemag-data-130k-v2.csv") %>% 
#            group_by(variety) %>% 
#            count() %>% 
#            arrange(desc(n)) %>% 
#            as.tibble()

Separate wines into four major types: red, white, sparkling, and rosé.

https://media.winefolly.com/Different-Types-of-Wine-v2.jpg

Subset data to four types of wine: red, white, sparkling, and rose.

# Lookup table of varieties per wine type; the columns used below are
# `red`, `white`, `sparkling` and `rose`.
wine_type <- read_xlsx("./wine_data/wine type.xlsx")

# Label each wine with the first type whose variety list contains it.
wine_df <-
  wine_df %>%
  mutate(
    type = case_when(
      # NOTE(review): if the lookup columns are padded with NA (assumed when
      # they have unequal lengths -- confirm against the spreadsheet), a
      # missing variety would match `%in%` and be mislabelled; keep it NA.
      is.na(variety) ~ NA_character_,
      variety %in% wine_type$red ~ "red",
      variety %in% wine_type$white ~ "white",
      variety %in% wine_type$sparkling ~ "sparkling",
      variety %in% wine_type$rose ~ "rose",
      # Varieties not present in any list stay unclassified.
      TRUE ~ NA_character_
    )
  )

# One data frame per wine type. filter() drops rows where the comparison is
# NA, so unclassified wines are excluded from every subset.
red_df <- wine_df %>% filter(type == "red")

white_df <- wine_df %>% filter(type == "white")

sparkling_df <- wine_df %>% filter(type == "sparkling")

rose_df <- wine_df %>% filter(type == "rose")

Make a plot of distribution of price/rating by type

# Price distribution per wine type, restricted to classified wines priced at
# 200 or less (rows with a missing price are dropped by the comparison).
wine_df %>%
  drop_na(type) %>%
  filter(price <= 200) %>%
  ggplot(aes(x = type, y = price, color = type)) +
  geom_violin()

# Rating distribution per wine type; `points` is shown under the name "rating".
wine_df %>%
  drop_na(type) %>%
  rename(rating = points) %>%
  ggplot(aes(x = type, y = rating, color = type)) +
  geom_violin()

Make plots of mean price and mean rating by country.

# Y-axis settings handed to plotly's layout().
y <- list(title = "Mean Price")

# Interactive bar chart of the mean price per country.
wine_df %>%
  drop_na(price) %>%
  group_by(country) %>%
  summarise(mean = mean(price)) %>%
  mutate(
    # Order the bars by the unrounded mean, then round for display;
    # the hover label uses the rounded value.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    text_label = str_c("Country:", country, "\nmean price:", mean)
  ) %>%
  plot_ly(
    x = ~country,
    y = ~mean,
    color = ~country,
    text = ~text_label,
    type = "bar",
    colors = "viridis"
  ) %>%
  layout(yaxis = y)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: Ignoring 1 observations
# Y-axis settings handed to plotly's layout().
y <- list(title = "Mean rating")

# Interactive bar chart of the mean rating (points) per country.
wine_df %>%
  drop_na(points) %>%
  group_by(country) %>%
  summarise(mean = mean(points)) %>%
  mutate(
    # Order the bars by the unrounded mean, then round for display;
    # the hover label uses the rounded value.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    text_label = str_c("Country:", country, "\nmean rating:", mean)
  ) %>%
  plot_ly(
    x = ~country,
    y = ~mean,
    color = ~country,
    text = ~text_label,
    type = "bar",
    colors = "viridis"
  ) %>%
  layout(yaxis = y)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Ignoring 1 observations

Even though Switzerland has the highest mean wine price, its mean rating ranks only 11th.

Make plots of ratings by year by wine type.

# Mean rating per year, one panel per wine type. Only classified wines with
# a rating and a year after 1900 are kept (a missing year fails the
# comparison and is dropped).
wine_df %>%
  drop_na(points, type) %>%
  filter(year > 1900) %>%
  group_by(year, type) %>%
  summarise(mean = round(mean(points), 2)) %>%
  ggplot(aes(x = year, y = mean, color = type)) +
  geom_point() +
  geom_line() +
  facet_wrap(~type) +
  labs(
    title = "change in mean ratings by wine type and year",
    x = "Year",
    y = "Mean rating"
  )
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

# Mean price per year, one panel per wine type. Only classified wines with
# a price and a year after 1900 are kept (a missing year fails the
# comparison and is dropped).
wine_df %>%
  drop_na(price, type) %>%
  filter(year > 1900) %>%
  group_by(year, type) %>%
  summarise(mean = round(mean(price), 2)) %>%
  ggplot(aes(x = year, y = mean, color = type)) +
  geom_point() +
  geom_line() +
  facet_wrap(~type) +
  labs(
    title = "change in mean price by wine type and year",
    x = "Year",
    y = "Mean price"
  )
## `summarise()` regrouping output by 'year' (override with `.groups` argument)